## Saving rate in other assets using HES

## output: share of income not used in consumption, by age and income group

## HILDA vs HES expenditure https://blog.grattan.edu.au/2019/05/using-the-right-survey-to-measure-retiree-spending/

## note expenditure is hh level
## we want inc at an indiv level


# Prelims -----------------------------------------------------------------


rm(list=ls())
gc()

sihhes_path <- "./SIH HES 2015-16/HHExpend2015-16_Stata/"

hes_p_path <- "HES15BP.dta"
hes_x_path <- "HES15BX.dta"



# Read SIH/HES data ---------------------------------------------------

sih_vars <- c(
  "abshid", ## hh id
  "abspid", ## person id
  "hespswt", ## person weight
  "ageec", ## age of person
  
  
  ## INCOME VARS - need total income after tax, excluding inc from dividends, interest (assumed reinvested in other inc via RoR)... and super which is modelled via drawdown
  "inctscp8", ## Total current weekly income from all sources
  "isupercp", ## Current weekly income from superannuation/annuity/private pension
  "idivtrcp", ## Current weekly income from dividends incl franking credits
  "infinrcp", ## Current weekly income from financial institution account interest (excl offset accounts) (reported)
  "indebrcp", ## Current weekly income from interest on debentures and bonds (reported)
  
  "itaxcp8",   ## Imputed current weekly tax payable
  
  ## HOUSING
  "tenurpcf"

)

## person data
hes_p <- read_dta(paste0(sihhes_path, hes_p_path),
                  col_select= sih_vars)

## expenditure data
hes_x <- read_dta(paste0(sihhes_path, hes_x_path),
                col_select = c(abshid, comcode, wklyexp))




# Read HILDA starting cohorts ---------------------------------------------

## this is to get proportions of each age group in each income group - same props will be applied in determining which inc qtile hes people belong to

starting_cohorts <- qread("./Input data/starting_cohorts_aiwbh_p.qs") ##

age_grp <- starting_cohorts %>% 
  distinct(age_grp)

inc_grps <- 5


# Calc total relevant expenditure -----------------------------------------

## separate income tax to remove this from the income side too
hes_exp <- hes_x %>% 
  ## note there are some negative expenditure due to sales - 
  ## "Where trade-ins, sales and insurance claims exceed the costs of acquisitions of the same expenditure item, expenditure is recorded as negative. For example, if someone sells a luxury motor vehicle and buys a less costly model, the amount of expenditure recorded in the HES would be negative."
  ## remove negative expenditure. do not add to income as this tends to be irregular income
  filter(wklyexp>0) %>% 
  mutate(exp_type = ifelse(comcode %in% c(1601010101, 1901010101), "excluded", "exp")) %>% ## 1601010101 income tax, 1901010101 superannuation
  group_by(abshid, exp_type) %>% 
  summarise(wklyexp = sum(wklyexp)) %>% 
  pivot_wider(names_from= "exp_type", values_from="wklyexp")

## income tax check - pretty close
# hes_tax_check <- hes_p %>% 
#   group_by(abshid) %>% 
#   summarise(tax_p = sum(itaxcp8)) %>% 
#   left_join(hes_exp)


# Calc proportion of each age group in each inc qtile ---------------------

age_inc_prop <- starting_cohorts %>% 
  ## age grp in HES format - grouping 65+
  mutate(age_grp_hes = ifelse(age_grp>="[65,70)", "[65,70]", as.character(age_grp))) %>% 
  ## summarise n by hes age grp and inc qtile
  group_by(age_grp_hes, total_inc_qtile) %>% 
  summarise(n=sum(n)) %>% 
  ## proportions of each age group - 
  group_by(age_grp_hes) %>% 
  mutate(prop = n/sum(n)) ## this determines the cut points for income group for HES


## age grp concordance
age_grp_concord <- starting_cohorts %>% 
  ## age grp in HES format
  mutate(age_grp_hes = ifelse(age_grp>="[65,70)", "[65,70]", as.character(age_grp))) %>% 
  distinct(age_grp, age_grp_hes)



# Calc non-consumed income ------------------------------------------------

hes_saving <- hes_p %>% 
  ## calc hh net inc after tax, and excl income from dividends, interest, super for consistency with model (dividends and interest assumed reinvested in other assets, and super assumed consumed)
  mutate(netincome = inctscp8 - itaxcp8 - isupercp - idivtrcp - infinrcp - indebrcp,
         ## inc pre tax which is used for determining inc qtiles in model
         total_inc = inctscp8 - isupercp - idivtrcp - infinrcp - indebrcp,
         ## homeowner id w & wo mortgage
         homeowner = ifelse(tenurpcf %in% c(1,2), 1, 0)
  ) %>% 
  left_join(hes_exp) %>% 
  ## apportion expenditure to individuals based on their share of post tax income
  group_by(abshid) %>% 
  ## for simplicity, remove all negative and 0 incomes (otherwise can get negative shares)
  filter(netincome>0) %>% 
  mutate(inc_share = netincome/ sum(netincome),
         exp_p0 = exp*inc_share, 
         ## if exp is more than inc, make it such that all inc is spent
         exp_p = ifelse(exp_p0>netincome, netincome, exp_p0) ) %>% 
  ungroup %>% 
  ## what share of net inc is not consumed per person - ie how much saved in "other"
  mutate(saving_rate_other = 1 - exp_p/netincome) %>% 
  
  ## age and inc group. Note inc group by quintile won't match HILDA version because hilda includes <15 year olds
  mutate(age_grp_hes = cut(ageec, breaks=seq(15, 90, 5), right=F, include.lowest=T, ordered_result=T),
         ## grouping some due to low numbers
         age_grp_hes = ifelse(age_grp_hes>="[65,70)", "[65,70]", as.character(age_grp_hes))
  ) %>% 
  
  ## calc inc group using HILDA proportions by age group
  split(., .$age_grp_hes) %>% 
  lapply(., function(x) {
    print(x[[1,"age_grp_hes"]])
    
    ## get cumulative probs for each inc qtile for this age group in HILDA
    inc_prop <- age_inc_prop %>% 
      ## filter to current age group
      filter(age_grp_hes == x[[1,"age_grp_hes"]] ) %>% 
      ## remove very small probs to enable clear cut points
      filter(prop>0.001) %>% 
      ## recalc prop based on removed values
      mutate(prop = n/sum(n)) %>% 
      arrange(total_inc_qtile) %>% 
      mutate(prop_cumu = cumsum(prop)) 
    
    probs <- c(0, inc_prop$prop_cumu) ## determines cut points in weighted quantile below
    labels <- inc_prop$total_inc_qtile ## inc quintile number
    
    output <- x %>% 
      mutate( total_inc_qtile = cut(total_inc, 
                                    ## cut points determined by weighted quantile (only works if cut points are unique)
                                    breaks = Hmisc::wtd.quantile(.$total_inc, 
                                                                 weights = .$hespswt, 
                                                                 probs = probs), 
                                    include.lowest=T,
                                    labels = labels,
                                    ordered_result=T)  ) %>% 
      ## fix in case of missing values at min or max total_inc
      mutate(total_inc_qtile = case_when(
        is.na(total_inc_qtile) & total_inc==min(total_inc) ~ min(labels),
        is.na(total_inc_qtile) & total_inc==max(total_inc) ~ max(labels),
        TRUE ~ total_inc_qtile
      ))
  }) %>% 
  rbindlist %>% 
  group_by(age_grp_hes, homeowner, total_inc_qtile) %>% 
  ## weighted average share of net income not consumed
  summarise(saving_rate_other = sum(saving_rate_other*hespswt)/sum(hespswt) )


## using model age groups:
hes_saving_age <- expand_grid(age_grp, total_inc_qtile=c(1:inc_grps), homeowner=c(0,1)) %>% 
  left_join(age_grp_concord) %>% 
  left_join(hes_saving %>% mutate(total_inc_qtile = as.numeric(total_inc_qtile))) %>% 
  group_by(age_grp, homeowner) %>% 
  mutate(saving_rate_other = case_when(
    is.na(saving_rate_other) & age_grp <="[10,15)" ~ 1, ## assume all is saved if under 15
    TRUE ~ saving_rate_other
  )) %>% 
  fill(saving_rate_other, .direction="downup") %>% 
  ungroup



## USED IN APPENDIX:
## Estimated saving rates from income into other assets ranged from 0 to 38 per cent for those aged 15 years and over. 
summary(hes_saving_age %>% filter(age_grp>="[15,20)"))

## They tended to be higher for higher income groups and lower for homeowners (who make mortgage repayments), all else equal. 
## Across age groups, saving rates into other assets tended to be highest when young, fell during middle age, and then 
## increased slightly again in old age.
ggplot(hes_saving_age %>% filter(age_grp>="[15,20)") %>% distinct(age_grp_hes, total_inc_qtile, homeowner, saving_rate_other) ) +
  geom_col(aes(x = total_inc_qtile, y = saving_rate_other, fill=as.factor(homeowner)), position="dodge") +
  facet_wrap(vars(age_grp_hes))


## for example, a 60–64 year old homeowner in the third income quintile was assumed to save
## about 12 per cent of their inheritance into other assets. 
hes_saving_age %>% filter(age_grp=="[60,65)" & homeowner==1 & total_inc_qtile==3)


## save output
qsave(hes_saving_age, "./Input data/saving_other_aih.qs")